/* Copyright 2012 Tobias Marschall
 * 
 * This file is part of CLEVER.
 * 
 * CLEVER is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * CLEVER is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with CLEVER.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <iostream>
#include <fstream>
#include <sstream>
#include <vector>
#include <limits>
#include <cassert>
#include <ctime>

#include <boost/program_options.hpp>
#include <boost/tokenizer.hpp>
#include <boost/lexical_cast.hpp>

#include <bamtools/api/BamReader.h>

#include "AlignmentPair.h"
#include "PackedAlignmentPair.h"
#include "GaussianEdgeCalculator.h"
#include "CliqueFinder.h"
#include "CliqueWriter.h"
#include "CoverageMonitor.h"
#include "CoverageWriter.h"
#include "HistogramBasedDistribution.h"
#include "AnyDistributionEdgeCalculator.h"
#include "ReadSetSignificanceTester.h"
#include "ReadSetZTester.h"
#include "ReadSetGenericTester.h"
#include "ReadGroups.h"
#include "ReadGroupAwareEdgeCalculator.h"
#include "ReadSetGroupWiseZTester.h"
#include "VersionInfo.h"

using namespace std;
using namespace boost;
namespace po = boost::program_options;

/** Prints the synopsis, a description of the expected input format, and all
 *  available options to stderr, then terminates with a non-zero exit status. */
void usage(const char* name, const po::options_description& options_desc) {
	// Fixed portion of the help text; empty entries produce blank lines.
	static const char* const description[] = {
		"",
		"<distribution-file> file with assumed internal segment length distribution.",
		"                    in default mode, this is a file containing one line with",
		"                    mean and standard deviation of the normal distribution",
		"                    to be used. Such a file can be generated using option -m",
		"                    of insert-length-histogram. Note that the \"internal segment\"",
		"                    does NOT include the read (ends), i.e. a fragment disjointly",
		"                    consists of two reads (read ends) and an internal segment.",
		"                    If option -A is given, a two-column text file is expected",
		"                    with columns <readgroup-name> <distribution-file>.",
		"                    If option -d is given, an arbitrary distribution file as produced",
		"                    as main output of insert-length-histogram can be used.",
		"                    (Note, however, that this is an experimental feature.)",
		"",
		"Reads alignment pairs from stdin in computes all cliques. Expected format:",
		"<read-name> <pair-nr> <read-group> <phred-sum1> <chrom1> <start1> <end1> <strand1> <phred-sum2> <chrom2> <start2> <end2> <strand2> <aln-pair-prob> <aln-pair-prob-inslength>",
		"",
		"NOTE: Alignment pairs are assumed to be ordered by field 7 (end1).",
		"",
		"Outputs all significant variant predictions (sorted by p-value) after controlling FDR.",
		""
	};
	cerr << "Usage: " << name << " [options] <distribution-file>" << endl;
	for (size_t i = 0; i < sizeof(description) / sizeof(description[0]); ++i) {
		cerr << description[i] << endl;
	}
	// The generated option descriptions follow the fixed text.
	cerr << options_desc << endl;
	exit(1);
}

/** Reads mean and standard deviation of the internal segment length
 *  distribution from the first line of the given file.
 *
 *  The line must consist of exactly two whitespace-separated numeric fields:
 *  <mean> <stddev>. On success the parsed values are stored through the
 *  given pointers; on failure the pointees are left untouched.
 *
 *  @param filename path of the file to read.
 *  @param mean output parameter for the parsed mean.
 *  @param sd output parameter for the parsed standard deviation.
 *  @return true on success; false if the file cannot be opened/read or the
 *          first line does not consist of exactly two numeric fields.
 */
bool read_mean_and_sd(const std::string& filename, double* mean, double* sd) {
	std::ifstream in(filename.c_str());
	std::string line;
	if (in.fail() || (!getline(in, line))) {
		return false;
	}
	// Parse exactly two numeric fields from the line; stream extraction
	// fails on non-numeric input, and any trailing token makes the line invalid.
	std::istringstream line_stream(line);
	double parsed_mean = 0.0;
	double parsed_sd = 0.0;
	if (!(line_stream >> parsed_mean >> parsed_sd)) {
		return false;
	}
	std::string extra;
	if (line_stream >> extra) {
		return false;
	}
	// Only commit results once the whole line has validated.
	*mean = parsed_mean;
	*sd = parsed_sd;
	return true;
}

/** Reads a two-column text file (<readgroup-name> <distribution-file>) and
 *  returns per-read-group insert size statistics, indexed by read group
 *  index as defined by the given ReadGroups object.
 *
 *  Exits the program if the file cannot be opened. Throws std::runtime_error
 *  on a malformed line or an unknown read group name.
 */
auto_ptr<vector<mean_and_stddev_t> > read_readgroup_list(const string& filename, const ReadGroups& read_groups) {
	typedef boost::tokenizer<boost::char_separator<char> > tokenizer_t;
	auto_ptr<vector<mean_and_stddev_t> > params(new vector<mean_and_stddev_t>(read_groups.size(), mean_and_stddev_t()));
	ifstream input(filename.c_str());
	if (input.fail()) {
		cerr << "Error: could not open \"" << filename << "\"." << endl;
		exit(1);
	}
	boost::char_separator<char> whitespace_separator(" \t");
	string line;
	int line_number = 0;
	while (getline(input, line)) {
		line_number += 1;
		tokenizer_t tokenizer(line, whitespace_separator);
		vector<string> fields(tokenizer.begin(), tokenizer.end());
		// Each line must contain exactly a read group name and a distribution file.
		if (fields.size() != 2) {
			ostringstream error_msg;
			error_msg << "Error parsing read group list. Offending line: " << line_number;
			throw std::runtime_error(error_msg.str());
		}
		const int group_index = read_groups.getIndex(fields[0]);
		if (group_index < 0) {
			ostringstream error_msg;
			error_msg << "Error parsing read group list. Read group \"" << fields[0] << "\" unknown in line " << line_number << ".";
			throw std::runtime_error(error_msg.str());
		}
		// Fit a Gaussian to the read group's histogram to obtain mean/stddev.
		HistogramBasedDistribution distribution(fields[1]);
		distribution.estimateGaussian(&(params->at(group_index).mean), &(params->at(group_index).stddev));
	}
	return params;
}

/** Entry point. Reads alignment pairs from stdin (one per line, ordered by
 *  field 7 = end1), filters them, feeds them to the clique finder, and writes
 *  significant variant predictions to stdout. See usage() for details. */
int main(int argc, char* argv[]) {
	VersionInfo::checkAndPrintVersion("clever-core", cerr);
	string commandline = VersionInfo::commandline(argc, argv);

	// PARAMETERS
	double edge_sig_level;
	double min_aln_weight;
	double max_insert_length;
	int max_coverage;
	string edge_filename;
	double fdr;
	string reads_output_filename;
	string coverage_output_filename;
	bool verbose = false;
	bool arbitrary_distribution = false;
	string cached_dist_filename;
	string read_group_bam_filename = "";
	bool multisample = false;
	bool readgroup_aware = false;

	po::options_description options_desc("Allowed options");
	options_desc.add_options()
	    ("verbose,v", po::value<bool>(&verbose)->zero_tokens(), "Be verbose: output additional statistics for each variation.")
		("edge_sig_level,p", po::value<double>(&edge_sig_level)->default_value(0.2), "Significance level for edges (the lower the level, the more edges will be present).")
		("min_aln_weight,w", po::value<double>(&min_aln_weight)->default_value(0.0016), "Minimum weight of alignment pairs to be considered.")
		("max_insert_length,l", po::value<double>(&max_insert_length)->default_value(50000), "Maximum insert length of alignments to be considered (0=unlimited).")
		("max_coverage,c", po::value<int>(&max_coverage)->default_value(200), "Maximum allowed coverage. If exceeded, violating reads are ignored. The number of such ignored reads is printed to stderr (0=unlimited).")
		("write_edges,e", po::value<string>(&edge_filename)->default_value(""), "Write edges to file of given name.")
		("fdr,f", po::value<double>(&fdr)->default_value(0.1), "False discovery rate (FDR).")
		("all,a", "Output all cliques instead of only the significant ones. Cliques are not sorted and last column (FDR) is not computed.")
		("output_reads,r", po::value<string>(&reads_output_filename)->default_value(""), "Output reads belonging to at least one significant clique to the given filename (along with their assignment to significant cliques.")
		("output_coverage,C", po::value<string>(&coverage_output_filename)->default_value(""), "Output the coverage with considered insert segments along the chromosome (one line per position) to the given filename.")
		("readgroup_aware,A", po::value<bool>(&readgroup_aware)->zero_tokens(), "Use a separate mean and standard deviations per read group. If given, argument <distribution-file> must refer to a file containing this information.")
		("arbitrary_dist,d", po::value<bool>(&arbitrary_distribution)->zero_tokens(), "Use a given (arbitrary) distribution instead of a normal distribution as null model (EXPERIMENTAL).")
		("cached_dist,D", po::value<string>(&cached_dist_filename)->default_value(""), "Load cached distributions as precomputed with tool \"precompute-distributions\". Does not change results, but increases speed. Only applicable when using option -d.")
		("read_groups,R", po::value<string>(&read_group_bam_filename)->default_value(""), "BAM file from whose header read group information is to be read, required if options -A or -S are to be used.")
		("multisample,S", po::value<bool>(&multisample)->zero_tokens(), "Read groups come from multiple samples as indicated by SM fields in BAM header. Evaluate cliques sample wise.")
	;
	
	// Refuse to run without piped input or without the mandatory positional argument.
	if (isatty(fileno(stdin)) || (argc<2)) {
		usage(argv[0], options_desc);
	}
	// The last argument is the distribution file; drop it from argc so that
	// boost::program_options only sees the option arguments.
	string distribution_filename(argv[argc-1]);
	argc -= 1;

	po::variables_map options;
	try {
		po::store(po::parse_command_line(argc, argv, options_desc), options);
		po::notify(options);
	} catch(std::exception& e) {
		cerr << "error: " << e.what() << "\n";
		return 1;
	}
	cerr << "Commandline: " << commandline << endl;

	// Validate mutually exclusive / dependent option combinations.
	bool output_all = options.count("all")>0;
	if (output_all && (reads_output_filename.size()>0)) {
		cerr << "Error: options -a and -r are mutually exclusive." << endl;
		return 1;
	}

	if (!arbitrary_distribution && (cached_dist_filename.size() > 0)) {
		cerr << "Error: options -D can only be used in connection with option -d." << endl;
		return 1;
	}

	if (multisample && (read_group_bam_filename.size()==0)) {
		cerr << "Error: option -S can only be used together with option -R." << endl;
		return 1;
	}

	if (readgroup_aware && (read_group_bam_filename.size()==0)) {
		cerr << "Error: option -A requires option -R." << endl;
		return 1;
	}

	clock_t clock_start = clock();
	EdgeCalculator* edge_calculator = 0;
	ReadSetSignificanceTester* significance_tester = 0;
	HistogramBasedDistribution* insert_length_distribution = 0;
	VariationCaller* variation_caller = 0;
	ReadGroups* read_groups = 0;
	// Optionally load read group information from the header of a BAM file (-R).
	if (read_group_bam_filename.size() > 0) {
		BamTools::BamReader bam_reader;
		if (!bam_reader.Open(read_group_bam_filename)) {
			cerr << "Could not read BAM file from \"" << read_group_bam_filename << "\"." << endl;
			return 1;
		}
		if (!bam_reader.GetHeader().HasReadGroups()) {
			cerr << "No read group information found in header of BAM file \"" << read_group_bam_filename << "\"." << endl;
			return 1;
		}
		read_groups = new ReadGroups(bam_reader.GetHeader().ReadGroups);
		if (!multisample && (read_groups->sampleCount() > 1)) {
			cerr << "Warning: multiple samples present according to BAM header. Input is treated as one sample." << endl;
			cerr << "         If this was not intended, please prepare single sample BAMs (preferred) or use option -S." << endl;
		}
		if (read_groups->sampleCount() > 5) {
			cerr << "Too many samples! CLEVER cannot be used for more than 5 samples, at present." << endl;
			return 1;
		}
		cerr << "Loaded read group information from \"" << read_group_bam_filename << "\":" << endl;
		cerr << *read_groups;
	}
	// Select the null model and construct edge calculator / significance tester /
	// variation caller accordingly. Three mutually exclusive modes:
	//   -A: per-read-group Gaussians, -d: arbitrary histogram-based distribution,
	//   default: single Gaussian given by mean and stddev.
	auto_ptr<vector<mean_and_stddev_t> > readgroup_params(0);
	if (readgroup_aware) {
		try {
			readgroup_params = read_readgroup_list(distribution_filename, *read_groups);
		} catch (std::runtime_error& e) {
			cerr << "Error: " << e.what() << endl;
			return 1;
		}
		assert(readgroup_params->size() == read_groups->size());
		cerr << "Read-group-aware mode turned on. Read group statistics:" << endl;
		for (size_t i=0; i<read_groups->size(); ++i) {
			cerr << "Read group " << read_groups->getName(i) << ": mean " << readgroup_params->at(i).mean << ", stddev " << readgroup_params->at(i).stddev << endl;
		}
		edge_calculator = new ReadGroupAwareEdgeCalculator(edge_sig_level,*readgroup_params);
		significance_tester = new ReadSetGroupWiseZTester(*readgroup_params);
		variation_caller = new VariationCaller(readgroup_params.get(), *significance_tester);
	} else if (arbitrary_distribution) {
		try {
			insert_length_distribution = new HistogramBasedDistribution(distribution_filename);
		} catch (std::runtime_error& e) {
			cerr << "Error: " << e.what() << endl;
			return 1;
		}
		edge_calculator = new AnyDistributionEdgeCalculator(edge_sig_level, *insert_length_distribution);
		ReadSetGenericTester* st = new ReadSetGenericTester(*insert_length_distribution);
		// Optionally pre-load cached distributions (-D) to speed up testing.
		if (cached_dist_filename.size() > 0) {
			try {
				ifstream ifs(cached_dist_filename.c_str());
				if (ifs.fail()) {
					cerr << "Error opening " << cached_dist_filename << endl;
					return 1;
				}
				st->readCachedDistributions(ifs);
				ifs.close();
				cerr << "Done loading cached distributions from file \"" << cached_dist_filename << "\"" << endl;
			} catch (std::runtime_error& e) {
				cerr << "Error: " << e.what() << endl;
				return 1;
			}
		}
		significance_tester = st;
		int median = insert_length_distribution->getMedian();
		cerr << "Median internal segment length: " << median << endl;
		variation_caller = new VariationCaller(median, *significance_tester);
	} else {
		double insert_mean = -1.0;
		double insert_stddev = -1.0;
		if (!read_mean_and_sd(distribution_filename, &insert_mean, &insert_stddev)) {
			cerr << "Error reading \"" << distribution_filename << "\"." << endl;
			return 1;
		}
		cerr << "Null distribution: mean " << insert_mean << ", sd " <<  insert_stddev << endl;
		edge_calculator = new GaussianEdgeCalculator(edge_sig_level,insert_mean,insert_stddev);
		significance_tester = new ReadSetZTester(insert_mean, insert_stddev);
		variation_caller = new VariationCaller(insert_mean, *significance_tester);
	}
	// Wire up output writers (cliques to stdout; optional edges, read lists,
	// and coverage tracks to files).
	CliqueWriter clique_writer(cout, *variation_caller, read_groups, multisample, output_all, fdr, verbose);
	CliqueFinder clique_finder(*edge_calculator, clique_writer, read_groups);
	EdgeWriter* edge_writer = 0;
	ofstream* edge_ofstream = 0;
	if (edge_filename.size()>0) {
		edge_ofstream = new ofstream(edge_filename.c_str());
		edge_writer = new EdgeWriter(*edge_ofstream);
		clique_finder.setEdgeWriter(*edge_writer);
	}
	ofstream* reads_ofstream = 0;
	if (reads_output_filename.size()>0) {
		reads_ofstream = new ofstream(reads_output_filename.c_str());
		clique_writer.enableReadListOutput(*reads_ofstream);
	}
	CoverageWriter* coverage_writer = 0;
	if (coverage_output_filename.size()>0) {
		coverage_writer = new CoverageWriter(coverage_output_filename);
	}

	// Main loop: stream alignment pairs from stdin, filter, and hand the
	// survivors to the clique finder. Counters track why pairs were skipped.
	size_t last_pos = 0;
	int n = 0;
	string line;
	size_t skipped_by_weight = 0;
	size_t skipped_by_length = 0;
	size_t skipped_by_coverage = 0;
	size_t valid_alignments = 0;
	size_t total_alignments = 0;
	while (getline(cin, line)) {
		n += 1;
		total_alignments += 1;
		try {
			AlignmentPair ap(line, read_groups);
			// Skip pairs with no internal segment, pairs spanning different
			// chromosomes, and pairs whose reads map to the same strand.
			if (ap.getEnd1()+1 > ap.getStart2()-1) continue;
			if (ap.getChrom1().compare(ap.getChrom2()) != 0) continue;
			if (ap.getStrand1().compare(ap.getStrand2()) == 0) continue;
			// Input must be sorted by end1 (field 7); enforce this invariant.
			if (ap.getEnd1()<last_pos) {
				cerr << "Error: Input is not ordered by position (field 7)! Offending line: " << n << endl;
				return 1;
			}
			valid_alignments += 1;
			last_pos = ap.getEnd1();
			auto_ptr<PackedAlignmentPair> alignment_autoptr(new PackedAlignmentPair(ap));
			if (max_insert_length>0) {
				if (alignment_autoptr->getInsertLength() > max_insert_length) {
					skipped_by_length += 1;
					continue;
				}
			}
			if (alignment_autoptr->getWeight() < min_aln_weight) {
				// cout << "Skipping alignment (weight): "  << alignment_autoptr->getName() << " weight: " << alignment_autoptr->getWeight() << endl;
				skipped_by_weight += 1;
				continue;
			}
			if (max_coverage>0) {
				if (clique_finder.getCoverageMonitor().probeAlignment(*alignment_autoptr) > (size_t)max_coverage) {
					// cout << "Skipping alignment (coverage): "  << alignment_autoptr->getName()  << endl;
					skipped_by_coverage += 1;
					continue;
				}
			}
			if (coverage_writer != 0) {
				coverage_writer->addAlignment(*alignment_autoptr);
			}
			clique_finder.addAlignment(alignment_autoptr);
		} catch (std::runtime_error&) {
			cerr << "Error parsing input, offending line: " << n << endl;
			return 1;
		}
	}
	// Flush remaining cliques and finalize FDR-controlled output.
	clique_finder.finish();
	clique_writer.finish();

	cerr << "Total alignments: " << total_alignments << endl;
	cerr << "Valid alignments: " << valid_alignments << endl;
	cerr << "Alignments with too low weight (skipped): " << skipped_by_weight << endl;
	cerr << "Alignments with too large insert length (skipped): " << skipped_by_length << endl;
	cerr << "Alignments skipped due to coverage constraints: " << skipped_by_coverage << endl;
	cerr << "Total number of cliques: " << clique_writer.getTotalCount() << endl;

	// Cleanup of heap-allocated components (note: read_groups is intentionally
	// left to process teardown).
	delete edge_calculator;
	delete variation_caller;
	delete significance_tester;
	delete insert_length_distribution;
	if (edge_writer != 0) {
		delete edge_writer;
		delete edge_ofstream;
	}
	if (reads_ofstream!=0) {
		delete reads_ofstream;
	}
	if (coverage_writer != 0) {
		coverage_writer->finish();
		delete coverage_writer;
	}
	double cpu_time = (double)(clock() - clock_start) / CLOCKS_PER_SEC;
	cerr << "Total CPU time: " << cpu_time << endl;
	return 0;
}
